library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0       ✔ purrr   0.3.2  
## ✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'tidyr' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## ── Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(httr)
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(rjson)
## 
## Attaching package: 'rjson'
## The following objects are masked from 'package:jsonlite':
## 
##     fromJSON, toJSON
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.2
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(stringr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:base':
## 
##     date
library(leaflet)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(htmltools)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:httr':
## 
##     config
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Pulling data from API (shout-out to Chris)

Chris provided this part of the code. The idea is to pull all the session IDs connected to the username NYCEJA and save the resulting measurements in an R object, df.

# Realtime-sessions endpoint, queried for every session of user NYCEJA.
real <- "http://aircasting.org/api/realtime/sessions.json?page=0&page_size=500&q[measurements]=true&q[time_from]=0&q[time_to]=2552648500&q[usernames]=NYCEJA"

# jsonlite is namespaced on purpose: rjson masks fromJSON().
tt <- jsonlite::fromJSON(real)

# One row per session for the AirBeam2 PM2.5 stream; `id` is the stream id.
t <- tt[["streams"]][["AirBeam2-PM2.5"]]
t <- data.table(t)
ID <- t$id[!is.na(t$id)]

# Lookup table from session title to stream id, keeping only sessions
# that actually have a PM2.5 stream id.
name <- data.frame(title = tt$title, id = t$id)
name <- name[!is.na(name$id), ]

# Download the measurements of every stream listed in `ID` (one request per
# stream) and collect them as a list of data.tables, each tagged with the
# stream id it came from.
#
# The list is preallocated and the loop uses seq_along(), so an empty `ID`
# performs no requests instead of iterating over c(1, 0).
dt <- vector("list", length(ID))
for (i in seq_along(ID)) {
  sess <- paste0("http://aircasting.org/api/realtime/stream_measurements.json/?end_date=2281550369000&start_date=0&stream_ids[]=", ID[i])
  # jsonlite is namespaced on purpose: rjson masks fromJSON().
  s1 <- data.table(jsonlite::fromJSON(sess))
  s1$ID <- ID[i]
  dt[[i]] <- s1
}

library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:lubridate':
## 
##     here
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
# Stack the per-stream tables into a single data frame: each list element is
# converted with data.frame() and the pieces are row-bound by plyr::ldply().
df <- ldply(.data = dt, .fun = data.frame)

Check df. The data contain measurements from 23 locations (represented by 23 IDs), generated every minute from 2018-6 to 2019-3. Note that the earliest date is in 2000, which does not make sense. It is more likely to be 2019, so I will change every 2000 to 2019.

# Preview the first rows of the combined measurements as a formatted table.
knitr::kable(head(df))
time value latitude longitude ID
2019-03-27T11:52:18Z 1 40.81336 -73.95920 297507
2019-03-27T11:53:18Z 3 40.81336 -73.95920 297507
2000-01-01T01:44:01Z 2 40.81390 -73.95864 268832
2000-01-01T03:18:42Z 6 40.81390 -73.95864 268832
2000-01-01T03:26:43Z 5 40.81390 -73.95864 268832
2000-01-01T03:27:57Z 5 40.81390 -73.95864 268832

Data cleaning

A bit of data cleaning and manipulation.

# Clean the raw measurements:
#   * rewrite the timestamp text (first 'T' -> space, first 'Z' removed),
#   * remap the implausible years (first '2000' -> '2019', '1999' -> '2018'),
#   * parse the text to a date-time with ymd_hms(),
#   * expose the stream id as a character column named `station_id`,
#   * drop rows whose timestamp could not be parsed.
# Each column is assigned exactly once inside mutate(), so the result does not
# depend on whether plyr's or dplyr's mutate() is the one on the search path.
air_data <- as_tibble(df) %>%
  mutate(
    time = time %>%
      str_replace('T', ' ') %>%
      str_replace('Z', '') %>%
      str_replace('2000', '2019') %>%
      str_replace('1999', '2018') %>%
      ymd_hms(),
    ID = as.character(ID)
  ) %>%
  dplyr::rename(station_id = ID) %>%
  filter(!is.na(time))

Visualization

Spatial

First of all, we want to know where these stations are. I will use the leaflet package and plot these locations on top of OpenStreetMap

# Unique station coordinates, each labelled "lat, lng" rounded to two decimals.
# NOTE(review): rows 9 and 20 of the de-duplicated table are dropped by
# position; the reason is not visible here and the indices depend on row
# order -- confirm they still point at the intended rows.
location <- distinct(df, latitude, longitude) %>%
  slice(-9, -20) %>%
  mutate(name = str_c(round(latitude, 2), round(longitude, 2), sep = ', '))

# Interactive map: default OpenStreetMap tiles with one green circle marker
# per station; the marker label is the HTML-escaped "lat, lng" name.
# `location` is attached to the map itself, so the marker layer inherits it.
leaflet(data = location) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    label = ~htmlEscape(name),
    color = 'green'
  )

Now I will look at each station. Once we have more information about these stations, I will change the size of each circle according to its average PM2.5 level.

Temporal

For each location, we want to know how PM2.5 changes over time. Because there are too many data points, I will reduce them to an hourly average and plot each location.

For example, station 3 is at (40.7109366, -73.9596247). Let’s plot it

# Measurements of the single station with id '246560', with the calendar date
# and the hour of day split out of the timestamp.
location_3 <- air_data %>%
  filter(station_id == '246560') %>%
  mutate(measure_date = as.Date(time), measure_hour = hour(time))

# Hourly mean PM2.5 for station 3, plotted in chronological order with a
# smoothed trend line.
#
# The per-minute readings are collapsed to one value per (date, hour) before
# plotting; previously the raw rows were plotted and no averaging happened.
# dplyr verbs are namespaced because plyr is attached after dplyr and masks
# mutate/arrange/summarise -- plyr::summarise() would ignore the grouping.
loc_3 <- location_3 %>%
  dplyr::group_by(measure_date, measure_hour) %>%
  dplyr::summarise(value = mean(value, na.rm = TRUE)) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(measure_date, measure_hour) %>%
  dplyr::mutate(
    # Human-readable "date hour" label for each averaged point.
    timepoint = str_c(measure_date, measure_hour, sep = ' '),
    # Sequential index of the hour, used as the x axis.
    row_num = dplyr::row_number()
  ) %>%
  ggplot(aes(x = row_num, y = value)) +
  geom_line() +
  geom_smooth()

ggplotly(loc_3)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'